Objective

TBW…

Load libraries

rm(list = ls())
#install.packages("tidyverse")
#install.packages("DataExplorer")
#install.packages("mice")
#install.packages("ggplot2")
#install.packages("RColorBrewer")
#install.packages("explore")
#install.packages("corrplot")
#install.packages("factoextra")
#install.packages("rworldmap")
#install.packages("tidytext")
#install.packages("ggsci")
library(tidyverse)
library(DataExplorer)
library(mice)
library(ggplot2)
library(RColorBrewer)
library(explore)
library(corrplot)
library(factoextra)
library(rworldmap)
library(tidytext)
library(ggsci)

Read the data

I have carefully handpicked variables that can help us characterize different economic structures from the World Development Indicators of the World Bank.

The dataset includes:

I downloaded data for years 2015 to 2019 so that by averaging over those years I am able to reduce the number of missing values but still retain a significant representation of the economic structure of different countries in a fairly recent period.

raw_data <- read_csv("WDI_data.csv", show_col_types = FALSE)
head(raw_data)
## # A tibble: 6 × 9
##   `Country Name` `Country Code` `Series Name`      `Series Code` `2015 [YR2015]`
##   <chr>          <chr>          <chr>              <chr>         <chr>          
## 1 Afghanistan    AFG            Agriculture, fore… NV.AGR.TOTL.… 20.63432271667…
## 2 Afghanistan    AFG            Industry (includi… NV.IND.TOTL.… 22.12404249069…
## 3 Afghanistan    AFG            Services, value a… NV.SRV.TOTL.… 53.23529349865…
## 4 Afghanistan    AFG            External balance … NE.RSB.GNFS.… ..             
## 5 Afghanistan    AFG            Trade (% of GDP)   NE.TRD.GNFS.… ..             
## 6 Afghanistan    AFG            Central governmen… GC.DOD.TOTL.… ..             
## # ℹ 4 more variables: `2016 [YR2016]` <chr>, `2017 [YR2017]` <chr>,
## #   `2018 [YR2018]` <chr>, `2019 [YR2019]` <chr>

Data can also be obtained the following way, directly from R by talking to the World Bank’s API. If downloaded this way we would need to subsequently subset the dataframe, obtaining just the countries, as by running WDI(country = "all"...) we obtain data also for subnational geographical entities (ex. Africa Eastern and Southern etc.).

# indicators_list <- raw_data |>
#   distinct(`Series Code`) |> 
#   drop_na() |>
#   pull(`Series Code`) 
# library(WDI)
# df = WDI(country = "all", indicator = indicators_list, start = 2015, end = 2019, extra = TRUE) 

Cleaning and pre-processing

data <- raw_data |> 
  # Dropping code of the indicator
  select(- "Series Code") |> 
  # Converting columns to numeric
  mutate(across(contains("YR"), as.numeric)) |> 
  # Obtaining the mean value for the 2015 to 2019 period
  mutate(mean_value = rowMeans(across(contains("YR")), na.rm = TRUE)) |> 
  # Dropping captions in the dataset
  slice(1:5208) |> 
  # Dropping columns with single year data
  select(-contains("YR")) |> 
  # Correctly encode NaN as NA
  mutate(mean_value = ifelse(is.nan(mean_value), NA, mean_value)) |>
  # Pivoting wider
  pivot_wider(names_from = "Series Name", values_from = "mean_value")

# Extracting original variables names
variables_df <- colnames(data)

# Modifying column names
colnames(data) <- c(
  "country_name",  
  "country_code",  
  "agriculture_%gdp",  
  "industry_%gdp",  
  "services_%gdp",  
  "trade_balance",  
  "trade_openess",  
  "public_debt",  
  "public_spending",  
  "unemployment_total",  
  "unemployment_youth",  
  "foreign_aid",  
  "natural_resources_rents",  
  "tourism_exports",  
  "rawagrimaterial_exports",  
  "food_exports",  
  "fuel_exports",  
  "manufactures_exports",  
  "ores_metals_exports",  
  "ict_services_exports",  
  "finance_services_exports",  
  "transport_services_exports",  
  "travel_services_exports",  
  "female_labor_participation",  
  "migrant_population",  
  "age_dependency"
)

# Creating a dataframe with the new variable name and the original definition
variables_df <- cbind(variables_df, colnames(data)) |> 
  as.tibble() |> 
  rename(original = variables_df, shortened = V2)

str(data)
## tibble [217 × 26] (S3: tbl_df/tbl/data.frame)
##  $ country_name              : chr [1:217] "Afghanistan" "Albania" "Algeria" "American Samoa" ...
##  $ country_code              : chr [1:217] "AFG" "ALB" "DZA" "ASM" ...
##  $ agriculture_%gdp          : num [1:217] 24.122 18.946 11.071 NA 0.533 ...
##  $ industry_%gdp             : num [1:217] 14 22 33.5 NA 10.7 ...
##  $ services_%gdp             : num [1:217] 57.1 46.2 51.1 NA 78 ...
##  $ trade_balance             : num [1:217] NA -15.23 -9.07 -35.65 NA ...
##  $ trade_openess             : num [1:217] NA 75.2 50.3 162.3 NA ...
##  $ public_debt               : num [1:217] NA 75 NA NA NA ...
##  $ public_spending           : num [1:217] NA 11.7 19 NA NA ...
##  $ unemployment_total        : num [1:217] 10.5 14 11.6 NA NA ...
##  $ unemployment_youth        : num [1:217] 15.3 32.4 29.8 NA NA ...
##  $ foreign_aid               : num [1:217] 21.3789 1.6073 0.0758 NA NA ...
##  $ natural_resources_rents   : num [1:217] 0.666 1.497 16.472 0 0 ...
##  $ tourism_exports           : num [1:217] 4.326 50.708 0.585 NA 81.802 ...
##  $ rawagrimaterial_exports   : num [1:217] 17.1903 1.2038 0.0379 NA 1.5958 ...
##  $ food_exports              : num [1:217] 61.257 8.609 0.929 NA 0.757 ...
##  $ fuel_exports              : num [1:217] 6.7373 4.6142 95.7541 NA 0.0336 ...
##  $ manufactures_exports      : num [1:217] 6.95 54.02 3.04 NA 90.66 ...
##  $ ores_metals_exports       : num [1:217] 1.288 5.308 0.242 NA 3.284 ...
##  $ ict_services_exports      : num [1:217] 65.12 28.25 61.07 NA 9.03 ...
##  $ finance_services_exports  : num [1:217] 3.182 0.371 11.415 NA 3.141 ...
##  $ transport_services_exports: num [1:217] 23.58 7.98 21.64 NA 1.76 ...
##  $ travel_services_exports   : num [1:217] 8.12 63.4 5.87 NA 86.07 ...
##  $ female_labor_participation: num [1:217] 20.5 58.7 16.2 NA NA ...
##  $ migrant_population        : num [1:217] 1.176 1.989 0.611 41.802 59.714 ...
##  $ age_dependency            : num [1:217] 4.52 19.54 8.48 8.86 18.44 ...
# Check correlation
plot_correlation(data, type = "continuous", cor_args = list("use" = "pairwise.complete.obs"))

I confirm that the variables appear to be correlated at least to a certain extent and that therefore are appropriate for this assignment.

Missing Data

plot_intro(data)

First let’s check if there are some countries with a particularly high number of missing data

# Constructing a dataset counting how many missing data each country has 
missing_counts <- data.frame(
  country_name = data$country_name,
  missing_count = rowSums(is.na(data[, -c(1, 2)]))
)

# Extracting the countries with at least 50% of missing data (12 out of 24 variables)
high_missing_countries <- missing_counts |> 
  filter(missing_count >= 12) |> 
  pull(country_name)
high_missing_countries
##  [1] "American Samoa"            "British Virgin Islands"   
##  [3] "Channel Islands"           "Eritrea"                  
##  [5] "Faroe Islands"             "Gibraltar"                
##  [7] "Guam"                      "Isle of Man"              
##  [9] "Korea, Dem. People's Rep." "Liechtenstein"            
## [11] "Micronesia, Fed. Sts."     "Monaco"                   
## [13] "Nauru"                     "Northern Mariana Islands" 
## [15] "Puerto Rico"               "Sint Maarten (Dutch part)"
## [17] "Somalia"                   "St. Martin (French part)" 
## [19] "Turks and Caicos Islands"  "Tuvalu"                   
## [21] "Venezuela, RB"             "Virgin Islands (U.S.)"

All these countries are either small island countries or countries with a very close authoritarian regime or countries ravaged by wars. Therefore due to the high percentage of missing data and their peculiarity I decide to get rid of them.

# Getting rid of them
reduced_df <- data |> 
  filter(!country_name %in% high_missing_countries)

I now plot the percentage of missing observations by column to see if some columns have a particularly low number of data.

plot_missing(data)

There is a worrying number of missing data especially for the variables public_debt and foreign_aid.

I now investigate more into the high percentage of missingness for the variable foreign_aid.

# Extracting the list of countries that report NAs for foreign aid
reduced_df |> 
  filter(is.na(foreign_aid)) |> 
  pull(country_name)
##  [1] "Andorra"              "Aruba"                "Australia"           
##  [4] "Austria"              "Bahamas, The"         "Bahrain"             
##  [7] "Barbados"             "Belgium"              "Bermuda"             
## [10] "Brunei Darussalam"    "Bulgaria"             "Canada"              
## [13] "Cayman Islands"       "Croatia"              "Curacao"             
## [16] "Cyprus"               "Czechia"              "Denmark"             
## [19] "Estonia"              "Finland"              "France"              
## [22] "French Polynesia"     "Germany"              "Greece"              
## [25] "Greenland"            "Hong Kong SAR, China" "Hungary"             
## [28] "Iceland"              "Ireland"              "Israel"              
## [31] "Italy"                "Japan"                "Korea, Rep."         
## [34] "Kuwait"               "Latvia"               "Lithuania"           
## [37] "Luxembourg"           "Macao SAR, China"     "Malta"               
## [40] "Netherlands"          "New Caledonia"        "New Zealand"         
## [43] "Norway"               "Oman"                 "Poland"              
## [46] "Portugal"             "Qatar"                "Romania"             
## [49] "Russian Federation"   "San Marino"           "Saudi Arabia"        
## [52] "Singapore"            "Slovak Republic"      "Slovenia"            
## [55] "Spain"                "St. Kitts and Nevis"  "Sweden"              
## [58] "Switzerland"          "Trinidad and Tobago"  "United Arab Emirates"
## [61] "United Kingdom"       "United States"

All those countries are high-income countries (sometimes even donor countries) that do not receive official development assistance therefore I can safely recode these NAs into zeros.

# Recoding foreign_aid NAs
reduced_df <- reduced_df |> 
  mutate(foreign_aid = ifelse(is.na(foreign_aid), 0, foreign_aid))

Finally I will now focus on the imputation of missing values for all variables, even for the column public_debt, that although it has a really high number of missing values, I consider important for the scope of my analysis.

First I need to decide which imputation method to use. I take the variables with the most missing values (public_debt and tourism_exports) and impute them using several methods. By comparing the original distributions of these variables with those obtained through different imputation mechanisms I will select the “best” imputation algorithm and apply it throughout the whole dataset.

# Dropping for the imputation country_code and country_name (identifier variables)
numeric <- reduced_df |> 
  select(-c("country_name", "country_code"))
identifiers <- reduced_df |> 
  select(c("country_name", "country_code"))

# Imputing public_debt with several methods
imputed_public_debt <- data.frame(
  original = numeric$public_debt,
  imputed_pmm = complete(mice(numeric, m= 1, method = "pmm", seed=123))$public_debt,
  imputed_cart = complete(mice(numeric, m = 1, method = "cart", seed=123))$public_debt,
  imputed_rf = complete(mice(numeric, m = 1, method = "rf", seed=123))$public_debt,
  imputed_norm = complete(mice(numeric, m = 1, method = "norm", seed=123))$public_debt,
  imputed_norm.boot = complete(mice(numeric, m = 1, method = "norm.boot", seed=123))$public_debt,
  imputed_lasso.norm = complete(mice(numeric, m = 1, method = "lasso.norm", seed=123))$public_debt)

# Imputing tourism exports with several methods
imputed_tourism_exports <- data.frame(
  original = numeric$tourism_exports,
  imputed_pmm = complete(mice(numeric, m= 1, method = "pmm", seed=123))$tourism_exports,
  imputed_cart = complete(mice(numeric, m = 1, method = "cart", seed=123))$tourism_exports,
  imputed_rf = complete(mice(numeric, m = 1, method = "rf", seed=123))$tourism_exports,
  imputed_norm = complete(mice(numeric, m = 1, method = "norm", seed=123))$tourism_exports,
  imputed_norm.boot = complete(mice(numeric, m = 1, method = "norm.boot", seed=123))$tourism_exports,
  imputed_lasso.norm = complete(mice(numeric, m = 1, method = "lasso.norm", seed=123))$tourism_exports)

I will now plot the obtained distributions to understand which method best resembles the original distribution.

# Plotting public_debt

# Convert data to long format for plotting
imputed_long_public_debt <- imputed_public_debt |> 
  pivot_longer(cols = everything(), names_to = "Method", values_to = "Value")

# Define color palette
colors_fill <- c(brewer.pal(6, "Set1"), "black")

# Plot with faceting
ggplot(imputed_long_public_debt, aes(x = Value, , fill = Method)) +
  geom_histogram(bins = 20, alpha = 0.7, color = "black") +
  facet_wrap(~Method, scales = "free_y") +
  scale_fill_manual(values = colors_fill) + 
  labs(title = "Comparison of Original and Imputed Distributions for Public Debt variable",
       x = "Public Debt",
       y = "Count") +
  theme_minimal() +
  theme(legend.position = "none")

# Plotting tourism exports

imputed_long_tourism <- imputed_tourism_exports %>%
  pivot_longer(cols = everything(), names_to = "Method", values_to = "Value")

colors_fill <- c(brewer.pal(6, "Set1"), "black")

ggplot(imputed_long_tourism, aes(x = Value, fill = Method)) +
  geom_histogram(bins = 20, alpha = 0.7, color = "black") +
  facet_wrap(~Method, scales = "free_y") +
  scale_fill_manual(values = colors_fill) + 
  labs(title = "Comparison of Original and Imputed Distributions for Tourism Exports Variable",
       x = "Tourism Exports",
       y = "Count") +
  theme_minimal() +
  theme(legend.position = "none") 

For both public_debt and tourism_exports predictive mean matching seems to be the algorithm that best resembles the original distribution so I will use it to predict the missing values for all the variables in the dataset.

# Imputing all missing observations
imputed_df <- complete(mice(numeric, m = 1, method = "pmm", seed=123))
final_df <- cbind(identifiers, imputed_df)

Descriptive analysis

print(describe(final_df), n = 26)
## # A tibble: 26 × 8
##    variable                   type     na na_pct unique    min  mean   max
##    <chr>                      <chr> <int>  <dbl>  <int>  <dbl> <dbl> <dbl>
##  1 country_name               chr       0      0    195  NA    NA     NA  
##  2 country_code               chr       0      0    195  NA    NA     NA  
##  3 agriculture_%gdp           dbl       0      0    194   0.02 10.0   37.5
##  4 industry_%gdp              dbl       0      0    195   5.52 24.9   60.8
##  5 services_%gdp              dbl       0      0    195  30.1  57.1   93.2
##  6 trade_balance              dbl       0      0    178 -73.5  -5.03  45.2
##  7 trade_openess              dbl       0      0    178  19.9  92.5  374. 
##  8 public_debt                dbl       0      0     73   7.37 62.4  200. 
##  9 public_spending            dbl       0      0    176   2.16 17.2   58.9
## 10 unemployment_total         dbl       0      0    179   0.13  7.84  26.8
## 11 unemployment_youth         dbl       0      0    179   0.43 17.6   74.1
## 12 foreign_aid                dbl       0      0    134  -0.01  3.33  53.1
## 13 natural_resources_rents    dbl       0      0    183   0     5.25  55.3
## 14 tourism_exports            dbl       0      0    166   0.07 19.8   94.2
## 15 rawagrimaterial_exports    dbl       0      0    179   0     3.69  61.5
## 16 food_exports               dbl       0      0    179   0    26.4   98.1
## 17 fuel_exports               dbl       0      0    176   0    14.9   99.9
## 18 manufactures_exports       dbl       0      0    179   0    40.5   95.8
## 19 ores_metals_exports        dbl       0      0    177   0     8.45  75.9
## 20 ict_services_exports       dbl       0      0    185   0    30.7   83.8
## 21 finance_services_exports   dbl       0      0    177   0.02  6.07  96.4
## 22 transport_services_exports dbl       0      0    184   0.29 20.5   79.2
## 23 travel_services_exports    dbl       0      0    185   1.19 43.1   94.4
## 24 female_labor_participation dbl       0      0    179   5.34 56.8   85.4
## 25 migrant_population         dbl       0      0    194   0.07  9.83  88.4
## 26 age_dependency             dbl       0      0    195   1.31 13.4   46.8

As can be seen now there are no more missing values. All my variables are percentages, but they are percentages of different things (as explained when describing the data) and they can operate on different scales. For instance trade_openess is trade as a % of GDP, but it can and sometimes it takes values above 100%. The same can happen for public_debt. trade_balance instead can take negative values.

Here is where I decide whether I need to scale my variables or not… TBW…

# Plotting the distributions of the variables with their mean
final_df |> 
  select(- c("country_name", "country_code")) |> 
  explore_all()

From the plots of the distributions of my variables I can see that most of them are right skewed, with many countries having values close to zero and long right tails. A partial exception to this are the variables trade_balance (because of its negative values) and female_labor_participation which exhibit partial left-skewness. More balanced are the distributions of industry_%gdp, services_%gdp and travel_services_exports, while manufactures_exports shows an interesting bimodal distribution.

final_df |> 
  select(- c("country_name", "country_code")) |> 
  cor() |> 
  corrplot(method = 'square', order = 'FPC', type = 'lower', diag = FALSE, tl.col = 'black')

Lastly I plot a correlation matrix from which it can be seen that there are some strong correlations, as well as numerous weak ones. For example having a high share of GDP coming from the primary sector (agriculture_%gdp) is negatively correlated with having a problem of an aging population (age_dependency), while it is positively correlated with receiving a lot of ODA (foreign_aid). This makes sense as rather undeveloped economies (relying a lot on the primary sector) are often likely to receive foreign aid and are often places where population is expanding rather than contracting and where life expectancy is not high, thus having a low age dependency ratio.

Principal Component Analysis

pca = prcomp(imputed_df, scale = TRUE)

As mentioned before, although all variables are percentages they are scaled to run principal components analysis to avoid the component weights being biased by the higher variance of some of the variables.

fviz_screeplot(pca, addlabels = TRUE)

As can be seen from the graph above the first principal component only explains about 18% of the variability of the data, the second one about 16%. The third and the forth component explain around 10% of the variability. This means that 4 components explain in total only around 55% of the variability, and it is still missing nearly one half of the information of my dataset . After the fifth component each component explain progressively less variance. It seems that the underlying structure of my data doesn’t neatly reduce to a small number of dimensions, and it’s instead highly multidimensional with no single or few patterns strongly dominating my dataset.

First principal component

# Plotting each variable relative importance to PC1
fviz_contrib(pca, choice = "var", axes = 1)

# Convert PCA rotation values into a dataframe
pca1_df <- data.frame(variable = rownames(pca$rotation), 
                      PC1 = pca$rotation[,1])
# Create a barplot with rotated labels
ggplot(pca1_df, aes(x = reorder(variable, PC1), y = PC1)) +
  geom_bar(stat = "identity", fill = "darkblue") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        panel.grid.minor = element_blank()) + 
  ylim(-0.5, 0.5) +
  labs(x = "Variables", 
       y = "PC1 Loadings", 
       title = "PCA Loadings for the First Prinicpial Component")

The first principal component is already quite hard to interpret. However it seems to reflect what we would consider developed economies, with strong service sectors, integrated in the global market (open economies) a positive trade balance, exporting manufactured goods rather than raw materials. On the contrary those countries do not rely on the primary sector a lot and have a problem of an ageing population.

# Printing the bottom 10 countries for PC1
identifiers$country_name[order(pca$x[,1])][1:10]
##  [1] "Syrian Arab Republic"     "Solomon Islands"         
##  [3] "Timor-Leste"              "Yemen, Rep."             
##  [5] "Guinea-Bissau"            "Sierra Leone"            
##  [7] "Ethiopia"                 "Central African Republic"
##  [9] "Kiribati"                 "Gambia, The"
# Printing the top 10 countries for PC1
identifiers$country_name[order(pca$x[,1], decreasing=T)][1:10]
##  [1] "Luxembourg"           "Andorra"              "Hong Kong SAR, China"
##  [4] "Singapore"            "San Marino"           "Bermuda"             
##  [7] "Malta"                "Ireland"              "Japan"               
## [10] "Switzerland"

This seems to be confirmed when we print the “best” and the “worst” countries relative to the first principal component. The top countries are all small developed countries integrated in the global market in terms of trade and with an old population. The worst countries instead are poor countries relying on foreign aid, the agricultural sector or exploiting natural resources.

To conclude, the main interpretation for PC1 seems to be a score of how open, developed and service oriented an economy is and how severe is the problem of an aging population.

# Map our PCA1 index in a map. Start by creating a df containing the PC1 score
map <- data.frame(country = identifiers$country_name,
                  country_code = identifiers$country_code, 
                  value=pca$x[,1])
#Create data object supporting the map
matched <- joinCountryData2Map(map, joinCode = "ISO3",
                               nameJoinColumn = "country_code")
#Draw the map
mapCountryData(mapToPlot=matched,
               nameColumnToPlot="value",
               numCats = 4,
               missingCountryCol="white",
               addLegend = TRUE,
               borderCol = "black",
               catMethod = "diverging", 
               colourPalette = "diverging",
               mapTitle = c("PCA1"), 
               lwd=0.5,
               oceanCol="lightblue")

Note that countries in white are those countries that I have deleted and for which therefore there is no data in my dataframe.

The map confirms my intuition. The first principal component is both an index of development, but also of openness and integration with the world markets of an economy, as well as an index of how worrying is the ageing problem of the population. This is why the U.S.A, are not in red unlike most of Europe and Japan.

Second principal component

# Plotting each variable relative importance to PC2
fviz_contrib(pca, choice = "var", axes = 2)

# Convert PCA rotation values into a dataframe
pca2_df <- data.frame(variable = rownames(pca$rotation), 
                      PC2 = pca$rotation[,2])
# Create a barplot with rotated labels
ggplot(pca2_df, aes(x = reorder(variable, PC2), y = PC2)) +
  geom_bar(stat = "identity", fill = "darkblue") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        panel.grid.minor = element_blank()) + 
  ylim(-0.5, 0.5) +
  labs(x = "Variables", 
       y = "PC2 Loadings", 
       title = "PCA Loadings for the Second Prinicpial Component")

The second principal component seems to reflect countries with very few industries, still reliant on the tertiary sector, but especially because of tourism and therefore not necessarily developed economies (foreign aid is quite high and more sophisticated goods are not exported).

# Printing the bottom 10 countries for PC2
identifiers$country_name[order(pca$x[,2])][1:10]
##  [1] "Kuwait"            "Brunei Darussalam" "Qatar"            
##  [4] "Equatorial Guinea" "Turkmenistan"      "Iraq"             
##  [7] "Azerbaijan"        "Oman"              "Congo, Rep."      
## [10] "Papua New Guinea"
# Printing the top 10 countries for PC2
identifiers$country_name[order(pca$x[,2], decreasing=T)][1:10]
##  [1] "Palau"                          "Syrian Arab Republic"          
##  [3] "St. Lucia"                      "Grenada"                       
##  [5] "Sao Tome and Principe"          "Aruba"                         
##  [7] "Vanuatu"                        "St. Vincent and the Grenadines"
##  [9] "Maldives"                       "Dominica"

The bottom countries for the second principal component are mainly oil or gas producer countries as the component places negative emphasis on the exports of fuel related products and on the importance of natural resources rents. The top countries on the other hand are basically always small islands, not necessarily rich, but that rely almost solely on tourism (their economy is not very diversified). A quite puzzling and unexplainable presence is Syria which is ranked as one of the best countries in terms of PC2.

Apart from Syria the interpretation of PC2 seems more straightforward than PC1 and seems to be a score of how reliant a country is on tourism.

# Map our PCA2 index in a map. Start by creating a df containing the PC2 score
map <- data.frame(country = identifiers$country_name,
                  country_code = identifiers$country_code, 
                  value=pca$x[,2])
#Create data object supporting the map
matched <- joinCountryData2Map(map, joinCode = "ISO3",
                               nameJoinColumn = "country_code")
#Draw the map
mapCountryData(mapToPlot=matched,
               nameColumnToPlot="value",
               numCats = 4,
               missingCountryCol="white",
               addLegend = TRUE,
               borderCol = "black",
               catMethod = "diverging", 
               colourPalette = "diverging",
               mapTitle = c("PCA2"), 
               lwd=0.5,
               oceanCol="lightblue")

The map does not do justice to this second principal component because its top countries are small island that cannot be seen very well. However, we can notice how mediterranean countries that rely on tourism more than their neighbors have a higher score than Northern Europe.

Third principal component

# Plotting each variable relative importance to PC3
fviz_contrib(pca, choice = "var", axes = 3)

# Convert PCA rotation values into a dataframe
pca3_df <- data.frame(variable = rownames(pca$rotation), 
                      PC3 = pca$rotation[,3])
# Create a barplot with rotated labels
ggplot(pca3_df, aes(x = reorder(variable, PC3), y = PC3)) +
  geom_bar(stat = "identity", fill = "darkblue") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        panel.grid.minor = element_blank()) + 
  ylim(-0.5, 0.5) +
  labs(x = "Variables", 
       y = "PC3 Loadings", 
       title = "PCA Loadings for the Third Prinicpial Component")

The third principal component places a lot of negative weight to unemployment levels which are basically the most defining factor of this component. It also places a positive importance to female labour participation. These countries however also seem to have a relatively undeveloped economy relying mainly on agriculture and are not oil producing countries.

# Printing the bottom 10 countries for PC3
identifiers$country_name[order(pca$x[,3])][1:10]
##  [1] "Libya"             "Iraq"              "Djibouti"         
##  [4] "Saudi Arabia"      "Angola"            "South Africa"     
##  [7] "Brunei Darussalam" "Congo, Rep."       "Jordan"           
## [10] "Kosovo"
# Printing the top 10 countries for PC3
identifiers$country_name[order(pca$x[,3], decreasing=T)][1:10]
##  [1] "Liberia"         "Benin"           "Solomon Islands" "Burundi"        
##  [5] "Guinea-Bissau"   "Mali"            "Sierra Leone"    "Japan"          
##  [9] "Moldova"         "Tanzania"

Although some countries might be unexpected the list of the top and worse countries is coherent with the definition of the third principal component. All the top countries are countries with very low level of unemployment, while the worst countries have high level of unemployment and often are oil producing countries.

Fourth principal component

# Plotting each variable relative importance to PC4
fviz_contrib(pca, choice = "var", axes = 4)

# Convert PCA rotation values into a dataframe
pca4_df <- data.frame(variable = rownames(pca$rotation), 
                      PC4 = pca$rotation[,4])
# Create a barplot with rotated labels
ggplot(pca4_df, aes(x = reorder(variable, PC4), y = PC4)) +
  geom_bar(stat = "identity", fill = "darkblue") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        panel.grid.minor = element_blank()) + 
  ylim(-0.5, 0.5) +
  labs(x = "Variables", 
       y = "PC2 Loadings", 
       title = "PCA Loadings for the Fourth Prinicpial Component")

The fourth component places strong negative emphasis on the percentage of foreigners in the population, while placing strong positive emphasis on manufactures exports and ict related service exports. Unemployment in such countries is quite high.

# Printing the bottom 10 countries for PC4
identifiers$country_name[order(pca$x[,4])][1:10]
##  [1] "Macao SAR, China"     "Qatar"                "United Arab Emirates"
##  [4] "Andorra"              "Bahrain"              "Kuwait"              
##  [7] "Liberia"              "Luxembourg"           "Timor-Leste"         
## [10] "Antigua and Barbuda"
# Printing the top 10 countries for PC4
identifiers$country_name[order(pca$x[,4], decreasing=T)][1:10]
##  [1] "North Macedonia"        "West Bank and Gaza"     "Yemen, Rep."           
##  [4] "India"                  "Djibouti"               "Bosnia and Herzegovina"
##  [7] "Serbia"                 "Bangladesh"             "Pakistan"              
## [10] "Nepal"

The list of the top and bottom countries in terms of PC4 helps us understanding it better. The bottom countries are countries that are attractive for foreign workers (gulf countries or small European countries such as Luxembourg and Andorra). The top countries instead are countries that do not attract migrants (wages are generally low) and their economy is developing. Still while to the first three components it could be assigned a general meaning for this last one it is more difficult.

Summary

As the interpretation of the fourth component becomes more difficult and the relative importance of the successive components decreases below 10% of the variability I choose to stop there.

To recap

  1. First principal component seems a score of how developed and integrated an economy is, as well as the gravity of an ageing population

  2. Second component seems a score of how reliant on tourism an economy is vs how reliant a country is on oil exports

  3. Third component seems a to mainly reflect unemployment levels in the countries

  4. Fourth component seems to reflect how attractive to foreign workers a country is, but the interpretation becomes more difficult

# First vs second PC
data.frame(z1=pca$x[,1], z2=pca$x[,2]) |> 
  ggplot(aes(z1, z2, label=identifiers$country_name)) + 
  geom_point(size=0) +
  labs(title="First and second principal components", x="PC1", y="PC2") +
  theme_bw() +
  theme(legend.position="bottom") + 
  geom_text(size=3, hjust=0.6, vjust=0, check_overlap = TRUE) 

# First vs third PC
data.frame(z1=pca$x[,1], z3=pca$x[,3]) |> 
  ggplot(aes(z1, z3, label=identifiers$country_name)) + 
  geom_point(size=0.3) +
  labs(title="First and third principal components (scores)", x="PC1", y="PC3") +
  theme_bw() +
  theme(legend.position="bottom") + 
  geom_text(size=3, hjust=0.6, vjust=0, check_overlap = TRUE)

Clustering

K-Means

First let’s try to determine the optimal number of clusters using different methods.

fviz_nbclust(scale(imputed_df), kmeans, method = 'wss', k.max = 10, nstart = 1000)

fviz_nbclust(scale(imputed_df), kmeans, method = 'silhouette', k.max = 10, nstart = 1000)

fviz_nbclust(scale(imputed_df), kmeans, method = 'gap_stat', k.max = 10, nstart = 100, nboot = 100, verbose = FALSE)

As we can see the wss method is not really helpful as it does not show any sudden flattening of the curve. The curve only progressively flattens without giving us any clear hint to the optimal number of clusters.

The silhoutte method suggest 7 clusters, while the gap_stat suggests 9 clusters. For this reason As seven clusters are already quite a lot, I will opt to have 7. However, this high number of clusters and the fact that we do not see any clear pattern in the wss graph further suggests me that my dataframe is not easily explainable by a low number of clusters and that the world’s economies structures present many differences among them.

# Running the clustering
set.seed(123)
fit <- kmeans(scale(imputed_df), iter.max = 1000, centers = 7, nstart = 10000)

# Number of observations by cluster
groups = fit$cluster
barplot(table(groups), col = "blue")

As can be seen the cluster n.7 and cluster n.1 have more than 50 observations while some other clusters (like 2 and 6) have less than 10 observations.

I will now plot the characteristics of the center of the cluster to try to understand how the countries are being clustered.

# Extract centers data
centers_df <- as.data.frame(fit$centers)

# Add cluster labels
centers_df$Cluster <- factor(1:nrow(centers_df))  

# Convert to long format for ggplot
centers_df <- centers_df %>%
  pivot_longer(-Cluster, names_to = "Variable", values_to = "Value")

# Plot using ggplot
ggplot(centers_df, aes(x = reorder_within(Variable, Value, Cluster), 
                       y = Value, fill = Cluster)) +
  geom_col() +
  facet_wrap(~ Cluster, scales = "free_x") +  
  labs(title = "Cluster Centers", x = "Variable", y = "Value") +
  scale_x_reordered() +
  theme_minimal() +
  scale_fill_manual(values = brewer.pal(7, "Set1")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0.5),
        panel.grid.minor = element_blank())

  • Cluster 1 seems to be incorporating countries where age_dependency is quite high and agriculture has a relatively unimportant contribution to gdp (agriculture_%gdp). It is difficult to define these countries precisely, but they are likely to be developed diversified economies (foreign_aid is low, female_labor_participation is high), that integrated in the global markets and do not rely on natural resources too much
  • Cluster 2 seems to be grouping a particular type of countries that place high importance on the export of agricultural raw materials (rawagrimaterial_exports). The interpretation of this cluster is difficult, but it should also be noted that very few (probably quite peculiar) countries are part of it
  • Cluster 3 seems to capture countries that rely either on the primary sector through food_exports or on the tertiary sector through tourism_exports. However these countries are likely not to be high income countries: foreign_aid is high and their secondary sector is rather undeveloped (industry_%gdp) eventually suffering a trade deficit (trade_balance). These countries are probably small islands
  • Cluster 4 seems to mainly be characterized by high levels of unemployment, (unemployment_total, unemployment_young). These countries however are likely to quite developed and place a lot of importance on the tourism sector (tourism_exports)
  • Cluster 5 seems to group countries that rely on natural_resources_rents (and fuel_exports). The fact that migrant_population is high makes me guess that these countries are Gulf countries.
  • Cluster 6 seems to be made up of countries with a developed economy and that are attractive to a foreigners (migrant_population). For these countries the tertiary sector is really important (services_%gdp) and they are usually really integrated and open economies (trade_openess and trade_balance). These are probably mainly small, high-income countries, often being global financial centers as well (finance_services_exports)
  • Cluster 7 does not place a lot of emphasis on any of the variables present. However, they are mainly agricultural economies (agriculture_%gdp) rather than service economies (services_%gdp). As age_dependency is low and foreign_aid is high, these countries are likely to be low-income countries probably located mainly in Africa

Let’s try to understand better which countries form each cluster by using a map and a clusplot.

# Plotting k-means clustering in a map
map <- data.frame(country=identifiers$country_name, country_code=identifiers$country_code, value=fit$cluster)

#Create data object supporting the map
matched <- joinCountryData2Map(map, joinCode = "ISO3",
                               nameJoinColumn = "country_code")
#Drawing the map
mapCountryData(mapToPlot=matched,
               nameColumnToPlot="value",
               numCats = 7,
               missingCountryCol="white",
               addLegend = TRUE,
               borderCol = "black",
               catMethod = "categorical", 
               colourPalette = brewer.pal(7, "Set1"),
               mapTitle = c("7-means Clusters"), 
               lwd=0.5,
               oceanCol="lightblue")

#Drawing a clusplot
fviz_cluster(fit, data = imputed_df, geom = c("point"), ellipse.type = 'norm', pointsize = 0.05) +
  theme_minimal() +
  geom_text(label = identifiers$country_name, hjust = 0, vjust = 0,size = 2,check_overlap = TRUE) +
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1")
## Too few points to calculate an ellipse

map |> 
  filter(value==2) |> 
  pull(country)
## [1] "Benin"           "Liberia"         "Solomon Islands"

From the map it can be seen that cluster 1 is formed by basically all the high and middle income countries of the world. It is a really vast group of countries that probably have really diversified economies and they do not rely specifically on any of the sector or on a particular type of exports.

It is really difficult to individuate cluster 2 countries, therefore I have decided to print them directly. These are a very small and peculiar group for which it is difficult to give a final explanation.

Cluster 3 countries are the ones with the highest score for PC2 (score of how reliant on tourism an economy is), but that are not very rich countries. These are again mainly small island countries (from the map it is very difficult to see them because of this).

Cluster 4 countries are mainly mediterranean countries that rely on tourism, but where unemployment is quite high plus some relatively developed countries in the South of Africa.

Cluster 5 is made up of the oil and gas producing countries especially in the Middle East, Central Asia, but also Northern and Western Africa.

Cluster 6 countries are not visible in the map because they are usually small high-income countries. They are often financial centers and they are well integrated in the global economy. These countries score really high on the first principal component.

Finally, cluster 7 seems to incorporate all the remaining middle and low income countries without any of the previous specifications. Middle income countries that end up in this cluster are probably more agricultural rather than service-based economies. They do not rely on a particular type of export.

Hierarchical

Even for hierarchical clustering I will group the world’s economies into 7 clusters for consistency for what has been done above.

# Compute the Euclidean distance matrix using the scaled data
d <- dist(scale(imputed_df), method = "euclidean") 
# Perform hierarchical clustering using Ward's D2 method
hc <- hclust(d, method = "ward.D2")

# Assign country names as labels
hc$labels <- identifiers$country_name

# Visualize the dendrogram with 7 clusters
fviz_dend(x = hc, 
          k=7,
          rect = TRUE, 
          rect_fill = TRUE, 
          cex=0.5,
          rect_border = "d3",
          palette = "d3")

From this graph it is difficult to understand a lot. However, it can be seen that the leftmost cluster is again made up of oil and gas producing countries in Africa, the Middle East and Central Asia.

The yellow group seems made up of mostly middle income countries, but some low and high income countries are present as well. It is a very big group of countries and it is very difficult to define precisely.

Than there is again the group of 3 particular countries made up by Solomon Islands, Benin and Liberia, for which as discussed above interpretation is difficult.

The red cluster seems to be a group of the poorest countries in the world. These countries are also probably relying a lot on foreign aid.

The light-blue group is made up of mainly Western countries (especially European countries).

The second rightmost cluster is made up of other middle-income countries, mainly in Eastern Europe, Africa, but some in Central America as well.

The last group on the right is made up of small states that often are islands.

Let’s see another visualization as well:

# Drawing a phylogenic tree
fviz_dend(x = hc,
          k = 7,
          color_labels_by_k = TRUE,
          cex = 0.5,
          palette = "d3",
          type = "phylogenic",
          repel = TRUE) +  
  labs(title = "Phylogenic tree of clustered world's economies") + 
  theme(axis.text.x = element_blank(), 
        axis.text.y = element_blank())

And finally plot it on a map:

#Assign each observation to one of the 7 clusters based on the hierarchical clustering
groups.hc = cutree(hc, k = 7)

# Map the hierarchical cluster into a map
map <- data.frame(country=identifiers$country_name, country_code=identifiers$country_code, value=groups.hc)
# Create data object supporting the map
matched <- joinCountryData2Map(map, joinCode = "ISO3",
                               nameJoinColumn = "country_code")
# Draw the map
mapCountryData(mapToPlot=matched,
               nameColumnToPlot="value",
               numCats = 7,
               missingCountryCol="white",
               addLegend = TRUE,
               borderCol = "black",
               catMethod = "categorical", 
               colourPalette = pal_d3("category10")(7),
               mapTitle = c("Hierarchical Clusters"), 
               lwd=0.5,
               oceanCol="lightblue")

Conclusions